1 Effect of UPSTM-Based Decorrelation on Feature Discovery

Here I showcase how to use the BSWiMS feature selection/modeling function coupled with the Goal Driven Sparse Transformation Matrix (UPSTM) as a pre-processing step to decorrelate highly correlated features. The aims are:

  1. To improve model performance by uncovering the hidden information between correlated features.

  2. To simplify the interpretation of the machine learning models.

This demo will use the following R libraries and a public activity-recognition dataset:

1.0.1 Loading the libraries

# Load the libraries used throughout the demo.
# FIX: stringr was missing from the original list even though later chunks
# call str_replace_all() and str_remove_all() from it.
library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
library(stringr)

# Snapshot of the default graphics parameters; plots restore them via par(op).
op <- par(no.readonly = TRUE)

# Global pander formatting options for the rendered tables.
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

Data Source https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6960825/

Mohino-Herranz I, Gil-Pita R, Rosa-Zurera M, Seoane F. Activity Recognition Using Wearable Physiological Measurements: Selection of Features from a Comprehensive Literature Study. Sensors (Basel). 2019 Dec 13;19(24):5524. doi: 10.3390/s19245524. PMID: 31847261; PMCID: PMC6960825.

1.2 The Data

# Load the raw sensor data (no header row; one observation per window) and the
# list of feature labels.
# NOTE(review): absolute, user-specific paths — update them to match your
# local copy of the repository before running.
Activitydata <- read.csv("~/GitHub/LatentBiomarkers/Data/ActivityData/data.txt", header=FALSE, stringsAsFactors=TRUE)
featNames <- read.table("~/GitHub/LatentBiomarkers/Data/ActivityData/Featurelabels.txt", quote="\"", comment.char="")
featNames <- as.character(t(featNames))
# Replace the "(abs)" tag with "_A_" so the labels are usable as column names,
# then append "_D" to duplicated labels to disambiguate them.
# NOTE(review): str_replace_all() requires stringr, which is not in the
# library list at the top of this document — confirm it is loaded.
featNames <- str_replace_all(featNames,"\\(abs\\)","_A_")
featNames[duplicated(featNames)] <- paste(featNames[duplicated(featNames)],"D",sep="_")

# Build a per-subject repetition counter: rows sharing the same subject ID
# (column V1) are numbered 1, 2, 3, ... and the counter restarts at 1 whenever
# the ID changes. The counter is used to make unique row names below.
# FIX: the original loop ran only to nrow-1, so the LAST row's rep_ID was
# never assigned and stayed 0 (row name "ID_0"). The loop now covers every
# row and guards the look-ahead comparison at the boundary.
rep_ID <- numeric(nrow(Activitydata))
ctr <- 1
for (ind in seq_len(nrow(Activitydata)))
{
  rep_ID[ind] <- ctr
  # Reset the counter when the next row belongs to a different subject.
  if (ind < nrow(Activitydata) && Activitydata$V1[ind] != Activitydata$V1[ind + 1]) ctr <- 0
  ctr <- ctr + 1
}
rownames(Activitydata) <- paste(Activitydata$V1,rep_ID,sep="_")
colnames(Activitydata) <- c("ID",featNames,"class")
Activitydata$rep <- rep_ID
  
# Frequency of each raw activity code.
tb <- table(Activitydata$class)

# Map the raw codes (taken in table order) onto short class labels:
# Neutral, Emotional, Mental and Physical activity.
classes <- setNames(c("Neu","Emo","Men","Phy"), names(tb))
Activitydata$class <- classes[as.character(Activitydata$class)]
table(Activitydata$class)
#> 
#>  Emo  Men  Neu  Phy 
#> 1120 1120 1120 1120



#ID_class <- paste(Activitydata$ID,Activitydata$class)
#Activitydata <- Activitydata[!duplicated(ID_class),]
table(Activitydata$class)
#> 
#>  Emo  Men  Neu  Phy 
#> 1120 1120 1120 1120

# Subject-wise train/test split: 60% of the UNIQUE subject IDs are assigned
# to training, so no subject contributes rows to both partitions.
# NOTE(review): there is no set.seed() before sample(), so the split — and
# every downstream number in this report — is not reproducible across runs.
nsub <- unique(Activitydata$ID)
trainFraction <- 0.6
trainSample <- sample(length(nsub),length(nsub)*trainFraction)
trainDataFrame <- Activitydata[Activitydata$ID %in% nsub[trainSample],]
testDataFrame <- Activitydata[Activitydata$ID %in% nsub[-trainSample],]

# Restrict both partitions to the measured features plus the outcome column.
trainDataFrame <- trainDataFrame[, c(featNames, "class")]
testDataFrame <- testDataFrame[, c(featNames, "class")]

# Keep-mask over columns: TRUE where the training-set variance is effectively
# non-zero; the appended TRUE always retains the trailing "class" column.
featVariances <- apply(trainDataFrame[, featNames], 2, var)
sdiszero <- c(featVariances > 1.0e-16, TRUE)
trainDataFrame <- trainDataFrame[, sdiszero]
testDataFrame <- testDataFrame[, sdiszero]


# Candidate predictor names: every column except the outcome.
varlist <- setdiff(colnames(trainDataFrame), "class")

# Drop near-duplicate features (pairwise correlation above 0.9999), keeping
# one representative per correlated group, and re-append the outcome.
tokeep <- c(as.character(correlated_Remove(trainDataFrame, varlist, thr = 0.9999)), "class")
trainDataFrame <- trainDataFrame[, tokeep]
testDataFrame <- testDataFrame[, tokeep]

# Rank-based (OrderLogit) scaling, fitted on the training set only.
trainScale <- FRESAScale(trainDataFrame,method="OrderLogit")
trainDataFrame <- trainScale$scaledData
table(trainDataFrame$class)
#> 
#> Emo Men Neu Phy 
#> 672 672 672 672

# Scale the test set with the TRAINING reference mean/dispersion so no
# test-set information leaks into the preprocessing.
testDataFrame <- FRESAScale(testDataFrame,method="OrderLogit",refMean=trainScale$refMean,refDisp=trainScale$refDisp)$scaledData
table(testDataFrame$class)
#> 
#> Emo Men Neu Phy 
#> 448 448 448 448

1.2.0.1 Standardize the names for the reporting

# Name of the outcome column used by all downstream reporting.
outcome <- "class"

# Analysis hyper-parameters.
trainFraction <- 0.7  # NOTE(review): overrides the 0.6 used for the actual split and appears unused below — confirm intent
rhoThreshold <- 0.6   # correlation threshold passed to IDeA
TopVariables <- 10    # number of top-ranked features kept for the final table
aucTHR <- 0.75        # minimum ROC AUC for a feature to enter the final table


# Alternative row-wise split, disabled in favor of the subject-wise split above.
#set.seed(5)
#trainSample <- sample(nrow(dataframe),nrow(dataframe)*trainFraction)
#trainDataFrame <- dataframe[trainSample,]
#testDataFrame <- dataframe[-trainSample,]

1.2.1 Data specs

# Class balance of the training and testing partitions.
pander::pander(table(trainDataFrame[,outcome]))
Emo Men Neu Phy
672 672 672 672
pander::pander(table(testDataFrame[,outcome]))
Emo Men Neu Phy
448 448 448 448

# Refresh the predictor list after the pruning/scaling steps above.
varlist <- colnames(trainDataFrame)
varlist <- varlist[varlist != outcome]

1.3 The heatmap of the data


# Numeric-encode the outcome so heatMaps can display it next to the features.
hmdf <- testDataFrame
hmdf$class <- as.numeric(as.factor(testDataFrame$class))

# Heat map of the raw (still correlated) test data; samples (rows) clustered.
hm <- heatMaps(data = hmdf, Outcome = "class",
               hCluster = "row", Scale = FALSE,
               cexRow = 0.25, cexCol = 0.15,
               xlab = "Feature", ylab = "Sample")

# Restore default plotting parameters.
par(op)

1.3.1 The UMAP

# One distinct color per activity class, used to color the point labels.
raincolors <- setNames(rainbow(length(classes)), classes)

# Two-dimensional UMAP embedding of the original (correlated) training
# features; points are drawn as class-colored text labels.
datasetframe.umap <- umap(trainDataFrame[, varlist], n_components = 2)
plot(datasetframe.umap$layout, xlab = "U1", ylab = "U2",
     main = "UMAP: Original", t = 'n')
text(datasetframe.umap$layout, labels = trainDataFrame$class,
     col = raincolors[trainDataFrame$class])

1.3.1.1 Correlation Matrix of the Decorrelated Test Data

The heat map of the testing set.


# Shrink fonts for the dense correlation heat map.
par(cex = 0.6, cex.main = 0.85, cex.axis = 0.7)

# Absolute Pearson correlations between the test features. NA entries
# (constant columns) are treated as zero and the diagonal is blanked so the
# trivial self-correlation does not dominate the color scale.
cormat <- cor(testDataFrame[, varlist], method = "pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0
gplots::heatmap.2(abs(cormat), trace = "none", mar = c(5, 5),
                  col = rev(heat.colors(5)),
                  main = "Original Correlation",
                  cexRow = 0.15, cexCol = 0.15,
                  key.title = NA, key.xlab = "Pearson Correlation",
                  xlab = "Feature", ylab = "Feature")

1.4 The decorrelation

# Run IDeA (iterative decorrelation) on the training data: feature pairs with
# |rho| above rhoThreshold (0.6) are combined into latent "La_" variables via
# a sparse transformation matrix stored in the "UPSTM" attribute of the result.
trainDecor <- IDeA(trainDataFrame,thr=rhoThreshold,verbose=TRUE)
#> 
#>  Included: 323 , Uni p: 0.01536768 , Uncorrelated Base: 54 , Outcome-Driven Size: 0 , Base Size: 54 
#> 
#> 
 1 <R=1.000,w=  1,N=  209>, Top: 47( 10 )[ 1 : 47 : 0.975 ]( 46 , 151 , 0 ),<|>Tot Used: 197 , Added: 151 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,w=  1,N=  209>, Top: 14( 3 )[ 1 : 14 : 0.975 ]( 14 , 27 , 46 ),<|>Tot Used: 202 , Added: 27 , Zero Std: 0 , Max Cor: 1.000
#> 
 3 <R=1.000,w=  1,N=  209>, Top: 1( 1 )[ 1 : 1 : 0.975 ]( 1 , 1 , 60 ),<|>Tot Used: 202 , Added: 1 , Zero Std: 0 , Max Cor: 0.974
#> 
 4 <R=0.974,w=  2,N=   75>, Top: 30( 2 )[ 1 : 30 : 0.937 ]( 30 , 38 , 61 ),<|>Tot Used: 234 , Added: 38 , Zero Std: 0 , Max Cor: 0.983
#> 
 5 <R=0.983,w=  2,N=   75>, Top: 5( 1 )[ 1 : 5 : 0.941 ]( 5 , 5 , 75 ),<|>Tot Used: 234 , Added: 5 , Zero Std: 0 , Max Cor: 0.935
#> 
 6 <R=0.935,w=  3,N=   84>, Top: 33( 1 )[ 1 : 33 : 0.868 ]( 33 , 43 , 79 ),<|>Tot Used: 257 , Added: 43 , Zero Std: 0 , Max Cor: 0.951
#> 
 7 <R=0.951,w=  3,N=   84>, Top: 4( 1 )[ 1 : 4 : 0.875 ]( 4 , 5 , 96 ),<|>Tot Used: 257 , Added: 5 , Zero Std: 0 , Max Cor: 0.867
#> 
 8 <R=0.867,w=  4,N=   85>, Top: 34( 1 )[ 1 : 34 : 0.784 ]( 34 , 43 , 99 ),<|>Tot Used: 270 , Added: 43 , Zero Std: 0 , Max Cor: 0.946
#> 
 9 <R=0.946,w=  4,N=   85>, Top: 4( 1 )[ 1 : 4 : 0.823 ]( 4 , 4 , 116 ),<|>Tot Used: 270 , Added: 4 , Zero Std: 0 , Max Cor: 0.929
#> 
 10 <R=0.929,w=  5,N=   85>, Top: 38( 1 )[ 1 : 38 : 0.715 ]( 37 , 46 , 119 ),<|>Tot Used: 284 , Added: 46 , Zero Std: 0 , Max Cor: 0.986
#> 
 11 <R=0.986,w=  5,N=   85>, Top: 5( 1 )[ 1 : 5 : 0.743 ]( 5 , 5 , 139 ),<|>Tot Used: 285 , Added: 5 , Zero Std: 0 , Max Cor: 0.735
#> 
 12 <R=0.735,w=  6,N=  104>, Top: 44( 3 )[ 1 : 44 : 0.600 ]( 43 , 51 , 140 ),<|>Tot Used: 295 , Added: 51 , Zero Std: 0 , Max Cor: 0.677
#> 
 13 <R=0.677,w=  6,N=  104>, Top: 8( 1 )[ 1 : 8 : 0.600 ]( 7 , 8 , 150 ),<|>Tot Used: 295 , Added: 8 , Zero Std: 0 , Max Cor: 0.599
#> 
 14 <R=0.000,w=  7,N=    0>
#> 
 [ 14 ], 0.5991722 Decor Dimension: 295 . Cor to Base: 167 , ABase: 32 , Outcome Base: 0 
#> 
# Project the held-out test set into the decorrelated basis learned on the
# training data, and refresh the predictor list for the latent columns.
testDecor <- predictDecorrelate(trainDecor,testDataFrame)
varlistc <- colnames(testDecor)[colnames(testDecor) != outcome]

# Total feature variance before vs. after decorrelation.
pander::pander(sum(apply(testDataFrame[,varlist],2,var)))

592

pander::pander(sum(apply(testDecor[,varlistc],2,var)))

168

# Empirical entropy (256-bin discretization of the pooled feature values)
# before vs. after decorrelation.
pander::pander(entropy(discretize(unlist(testDataFrame[,varlist]), 256)))

4.88

pander::pander(entropy(discretize(unlist(testDecor[,varlistc]), 256)))

2.85


# Correlation structure of the test set AFTER IDeA decorrelation; NA entries
# become zero and the diagonal is blanked, as for the original heat map.
cormat <- cor(testDecor[, varlistc], method = "pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0

gplots::heatmap.2(abs(cormat), trace = "none", mar = c(5, 5),
                  col = rev(heat.colors(5)),
                  main = "Correlation after IDeA",
                  cexRow = 0.15, cexCol = 0.15,
                  key.title = NA, key.xlab = "Pearson Correlation",
                  xlab = "Feature", ylab = "Feature")

# Restore default plotting parameters.
par(op)

1.5 The heatmap of the decorrelated data

# Numeric-encode the outcome of the decorrelated test data for heatMaps.
hmdf <- testDecor
hmdf$class <- as.numeric(as.factor(testDecor$class))

# Heat map of the decorrelated data; unlike the raw-data map, features are
# re-scaled here (Scale = TRUE).
hm <- heatMaps(data = hmdf, Outcome = "class",
               hCluster = "row", Scale = TRUE,
               cexRow = 0.15, cexCol = 0.15,
               xlab = "Feature", ylab = "Sample")

# Restore default plotting parameters.
par(op)

1.5.1 The decorrelated UMAP


# UMAP embedding of the decorrelated training features, drawn with the same
# class-colored labels as the original-feature embedding for comparison.
datasetframe.umap <- umap(trainDecor[, varlistc], n_components = 2)
plot(datasetframe.umap$layout, xlab = "U1", ylab = "U2",
     main = "UMAP: After IDeA", t = 'n')
text(datasetframe.umap$layout, labels = trainDecor$class,
     col = raincolors[trainDecor$class])

1.5.2 The decorrelation matrix


# Shrink fonts for the sparse-matrix heat map.
par(cex = 0.6, cex.main = 0.85, cex.axis = 0.7)

# The sparse transformation matrix learned by IDeA; plot its sparsity
# pattern (1 wherever a coefficient is non-zero).
UPSTM <- attr(trainDecor, "UPSTM")

gplots::heatmap.2(1.0 * (abs(UPSTM) > 0), trace = "none", mar = c(5, 5),
                  col = rev(heat.colors(5)),
                  main = "Decorrelation matrix",
                  cexRow = 0.15, cexCol = 0.15,
                  key.title = NA, key.xlab = "|Beta|>0",
                  xlab = "Output Feature", ylab = "Input Feature")

# Restore default plotting parameters.
par(op)

1.5.3 Univariate


# Univariate AUC ranking of the ORIGINAL (correlated) features for the
# one-vs-rest problem of the first class label ("Neu").
# NOTE(review): the variable is named btrain but is built from testDataFrame —
# confirm whether the training set was intended here.
btrain <- testDataFrame
btrain$class <- 1.0*(btrain$class == classes[1])
univar <- uniRankVar(varlist,
               "class~1",
               "class",
               btrain,
               rankingTest="AUC")

100 : ECG_p_HF_prctile75 200 : EDA_Original_harmmean 300 : EDA_p_samples_D


# Count of raw features with Wilcoxon-adjusted p < 0.01 on each partition.
rawadjTrainpvalues <- univariate_Wilcoxon(trainDataFrame,outcome,pvalue=0.01)
rawadjTestpvalues <- univariate_Wilcoxon(testDataFrame,outcome,pvalue=0.01)
pander::pander(c(train=length(rawadjTrainpvalues)))
train
135
pander::pander(c(test=length(rawadjTestpvalues)))
test
151
# Jaccard index (intersection over union) between the train- and
# test-selected feature sets: a measure of selection stability.
jaccard <- sum(names(rawadjTestpvalues) %in% names(rawadjTrainpvalues))/length(unique(c(names(rawadjTrainpvalues),names(rawadjTestpvalues))))
pander::pander(c(Raw_Jaccard=jaccard))
Raw_Jaccard
0.653

# Univariate AUC ranking of the DECORRELATED features for the same
# one-vs-rest problem ("Neu" vs the rest), on the decorrelated test set.
btest <- testDecor
btest$class <- 1.0*(btest$class == classes[1])
univarDe <- uniRankVar(varlistc,
               "class~1",
               "class",
               btest,
               rankingTest="AUC",
               )

100 : La_ECG_p_HF_prctile75 200 : La_EDA_Original_harmmean 300 : La_EDA_p_samples_D


# Same significance counts and stability measure on the DECORRELATED data.
# More significant features and a higher Jaccard index than the raw features
# indicate the decorrelation uncovered additional stable associations.
deadjTrainpvalues <- univariate_Wilcoxon(trainDecor,outcome,pvalue=0.01)
deadjTestpvalues <- univariate_Wilcoxon(testDecor,outcome,pvalue=0.01)
pander::pander(c(train=length(deadjTrainpvalues)))
train
235
pander::pander(c(test=length(deadjTestpvalues)))
test
242
jaccard <- sum(names(deadjTestpvalues) %in% names(deadjTrainpvalues))/length(unique(c(names(deadjTrainpvalues),names(deadjTestpvalues))))
pander::pander(c(De_Jaccard=jaccard))
De_Jaccard
0.704

1.5.4 Final Table


# Columns of the univariate report to carry into the final table.
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
# Logical mask selecting the first TopVariables (10) rows of the ranked
# report. NOTE(review): idiomatic form would be seq_len(length(varlist)).
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univar$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
IT_CCV_LF 2.635 1.897 -0.1785 0.605 0 0.863
IT_CCV_HF 2.635 1.897 -0.1785 0.605 0 0.863
IT_BRV_min -0.436 0.549 0.3369 0.412 0 0.857
IT_BRV_std 2.619 1.919 -0.2036 0.588 0 0.854
IT_BRV_mad 2.619 1.919 -0.2036 0.588 0 0.854
IT_PSD_mean 2.679 1.852 -0.0523 0.474 0 0.854
IT_PSD_trimmean25 2.679 1.852 -0.0523 0.474 0 0.854
IT_PSD_min 2.679 1.852 -0.0523 0.474 0 0.854
IT_VLF_std 2.679 1.852 -0.0523 0.474 0 0.854
IT_LF_std_D 2.679 1.852 -0.0523 0.474 0 0.854


# NOTE(review): pwilvalue is assigned here but never used in this chunk.
pwilvalue <- univarDe$orderframe$FRes.p
# Keep decorrelated features that are both top-ranked and above the AUC cut.
valroc <- univarDe$orderframe$ROCAUC > aucTHR & topvar



# Assemble the final reporting table: univariate statistics of the selected
# decorrelated features, plus the linear formula defining each latent feature.
finalTable <- univarDe$orderframe[valroc,univariate_columns]

# Latent-variable coefficient lists and feature scores produced by IDeA.
dc <- getLatentCoefficients(trainDecor)
fscores <- attr(trainDecor,"fscore")

# Build a human-readable formula string for each selected feature from its
# latent coefficients. Features that are not latent have a NULL entry in dc,
# so their formula stays empty.
# FIX: removed the dead pre-loop assignment `dx <- names(deFromula)[1]`
# (immediately overwritten by the for loop) and merged the two duplicated
# sprintf branches into one format selection.
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    # Skip the feature's own (unit) coefficient; report only mixing terms.
    if (cf != dx)
    {
      # Positive coefficients get an explicit leading "+" sign.
      fmt <- if (coef[cf] > 0) "+ %5.3f*%s" else "%5.3f*%s"
      deFromula[dx] <- paste(deFromula[dx], sprintf(fmt, coef[cf], cname[cf]))
    }
  }
}

# NOTE(review): orgnamez is not used below — kept in case a later chunk
# (outside this view) references the "La_"-stripped names.
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")

finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
finalTable$pvalue <- deadjTestpvalues[rownames(finalTable)]

pander::pander(finalTable)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC DecorFormula fscores pvalue
IT_VLF_std 2.68e+00 1.85e+00 -5.23e-02 4.74e-01 0 0.854 11 1.05e-13
La_IT_PSD_trimmean25 5.46e-07 1.23e-06 -4.16e-07 8.51e-07 0 0.831 + 1.000IT_PSD_trimmean25 -1.000IT_VLF_std 0 4.58e-13
IT_PSD_baseline 1.34e+00 1.01e+00 1.05e-01 8.09e-01 0 0.824 4 4.70e-117
IT_BRV_kurtosis -1.29e+00 8.77e-02 -1.30e+00 2.55e-03 0 0.820 NA 1.60e-26
IT_MF_kurtosis 1.10e-01 1.65e-01 1.88e-01 8.10e-03 0 0.790 NA 5.11e-22
IT_p_Total_skewness -5.73e-02 2.76e-01 6.77e-02 1.06e-02 0 0.766 1 1.97e-18
La_IT_LF_std_D -7.37e-09 5.10e-07 -2.92e-07 4.00e-07 0 0.761 -1.000IT_VLF_std + 1.000IT_LF_std_D -1 4.64e-08